"""
OpenAlex PDF Downloader

Downloads PDFs from OpenAlex using their API to find open access PDFs.
OpenAlex aggregates open access information from multiple sources.
"""

import re
from typing import Optional
from urllib.parse import urlparse

import requests
from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult


class OpenAlexDownloader(BaseDownloader):
    """Downloader for OpenAlex papers with open access PDF support."""

    def __init__(
        self, timeout: int = 30, polite_pool_email: Optional[str] = None
    ):
        """
        Initialize OpenAlex downloader.

        Args:
            timeout: Request timeout in seconds
            polite_pool_email: Optional email for polite pool (faster API access)
        """
        super().__init__(timeout)
        self.polite_pool_email = polite_pool_email
        self.base_api_url = "https://api.openalex.org"

    def can_handle(self, url: str) -> bool:
        """Check if URL is from OpenAlex."""
        try:
            hostname = urlparse(url).hostname
            return bool(
                hostname
                and (
                    hostname == "openalex.org"
                    or hostname.endswith(".openalex.org")
                )
            )
        except (ValueError, AttributeError, TypeError):
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from OpenAlex."""
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason."""
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for OpenAlex"
            )

        # Extract work ID from URL
        work_id = self._extract_work_id(url)
        if not work_id:
            return DownloadResult(
                skip_reason="Invalid OpenAlex URL - could not extract work ID"
            )

        logger.info(f"Looking up OpenAlex work: {work_id}")

        # Get work details from API to find PDF URL
        pdf_url = self._get_pdf_url(work_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - no free PDF available"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        else:
            return DownloadResult(
                skip_reason="Open access PDF URL found but download failed"
            )

    def _extract_work_id(self, url: str) -> Optional[str]:
        """
        Extract OpenAlex work ID from URL.

        Handles formats like:
        - https://openalex.org/W123456789
        - https://openalex.org/works/W123456789

        Returns:
            Work ID (e.g., W123456789) or None if not found
        """
        # Use urlparse for more robust URL handling (handles query strings, fragments)
        parsed = urlparse(url)
        if not parsed.netloc or "openalex.org" not in parsed.netloc:
            return None

        # Extract work ID from path (W followed by digits)
        # Handles /works/W123 or /W123
        path = parsed.path
        match = re.search(r"(?:/works/)?(W\d+)", path)
        return match.group(1) if match else None

    def _get_pdf_url(self, work_id: str) -> Optional[str]:
        """
        Get open access PDF URL from OpenAlex API.

        Args:
            work_id: OpenAlex work ID (e.g., W123456789)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request
            api_url = f"{self.base_api_url}/works/{work_id}"
            params = {"select": "id,open_access,best_oa_location"}

            # Add polite pool email if available (gets faster API access)
            headers = {}
            if self.polite_pool_email:
                headers["User-Agent"] = f"mailto:{self.polite_pool_email}"

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Check if it's open access
                open_access_info = data.get("open_access", {})
                is_oa = open_access_info.get("is_oa", False)

                if not is_oa:
                    logger.info(f"Work {work_id} is not open access")
                    return None

                # Get PDF URL from best open access location
                best_oa_location = data.get("best_oa_location", {})
                if best_oa_location:
                    # Try pdf_url first, fall back to landing_page_url
                    pdf_url = best_oa_location.get("pdf_url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for work {work_id}: {pdf_url}"
                        )
                        return pdf_url

                    # Some works have landing page but no direct PDF
                    landing_url = best_oa_location.get("landing_page_url")
                    if landing_url:
                        logger.info(
                            f"Found landing page for work {work_id}: {landing_url}"
                        )
                        # Validate that landing page is actually a PDF before returning
                        try:
                            head_response = self.session.head(
                                landing_url,
                                timeout=self.timeout,
                                allow_redirects=True,
                            )
                            content_type = head_response.headers.get(
                                "Content-Type", ""
                            ).lower()
                            if "application/pdf" in content_type:
                                logger.info(
                                    f"Landing page is a direct PDF link for work {work_id}"
                                )
                                return landing_url
                            else:
                                logger.info(
                                    f"Landing page is not a PDF (Content-Type: {content_type}), skipping"
                                )
                        except Exception:
                            logger.exception(
                                f"Failed to validate landing page URL for work {work_id}"
                            )

                logger.info(
                    f"No PDF URL available for open access work {work_id}"
                )
                return None

            elif response.status_code == 404:
                logger.warning(f"Work not found in OpenAlex: {work_id}")
                return None
            else:
                logger.warning(f"OpenAlex API error: {response.status_code}")
                return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query OpenAlex API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse OpenAlex API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging
